# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #  
#
#' @title  Compile R objects used in the paper, supporting materials, and 
#'          data appendix	
#' @author Hauke Licht
#
# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #

# setup ----

# load required packages

# # a) data import and wrangling
library(readr)

# # b) data wrangling
library(tibble)
library(dplyr)
library(tidyr)
library(lubridate)
library(purrr)
library(stringr)

# # c) visualization
library(ggplot2)
library(ggridges)
library(viridis)
library(patchwork)

# # d) analyses
library(plm)

# set directory paths
base_path <- file.path(".")
data_path <- file.path(base_path, "data")
exdata_path <- file.path(data_path, "exdata")
input_path <- file.path(data_path, "input")
fits_path <- file.path(data_path, "fits")
codings_path <- file.path(data_path, "intermediate", "codings")
labelings_path <- file.path(data_path, "intermediate", "labelings")
output_path <- file.path(data_path, "output")

# load custom helper functions 
helpers_path <- file.path(base_path, "code", "helpers")
source(file.path(helpers_path, "utils.R"))
source(file.path(helpers_path, "krippendorffs_alpha.R"))
source(file.path(helpers_path, "plot_setup.R"))

# create or read "paper_objects" object 
paper_objects_fp <- file.path(output_path, "paper_objects.rds") # !!!
if (file.exists(paper_objects_fp)) {
  paper_objects <- read_rds(paper_objects_fp)
} else {
  list() -> 
    paper_objects -> 
    paper_objects$desc -> 
    paper_objects$tables -> 
    paper_objects$figures -> 
    paper_objects$data -> 
    paper_objects$misc 
}


# inspect  
str(paper_objects, 1)

# 1. DATA COLLECTION ----

## a)  Parliamentary parties tweets ----
paper_objects$data$countries <- countries <- read_csv(file.path(exdata_path, "countries.csv"))

# load parliamentary tweets dataset 
parl_party_tweets <- read_rds(file.path(input_path, "parl_party_tweets.rds"))

parl_party_tweets$screen_name[parl_party_tweets$screen_name == "FratellidItaIia"] <- "FratellidItalia"

### tweets data descriptives ----

paper_objects$desc$tweets <- list()

# country level
tmp <- left_join(count(parl_party_tweets, country_iso3c) , countries)

paper_objects$desc$tweets$countries_nobs <- setNames(tmp$n, tmp$country_iso3c)
paper_objects$desc$tweets$countries <- sort(setNames(tmp$country_name, tmp$country_iso3c))

# party level
paper_objects$desc$tweets$n_parties <- nrow(count(parl_party_tweets, country_iso3c, party_name_short, party_id))

# corpus level
paper_objects$desc$tweets$first_tweet_date <- trimws(format(lubridate::as_date(min(parl_party_tweets$created_at)), "%e. %B %Y"))
paper_objects$desc$tweets$last_tweet_date <- trimws(format(lubridate::as_date(max(parl_party_tweets$created_at)), "%e. %B %Y"))


## b) Parliamentary configurations ----

new_configs <- read_csv(file.path(exdata_path, "new_elections.csv"))

# basis for data appendix table S3
paper_objects$tables$parl_configs_manually_added <- new_configs %>% 
  mutate(
    party_str = case_when(
      party_pm & party_in_cabinet ~ sprintf("\\underline{\\textbf{%s}}", party_name_short)
      , party_pm ~ sprintf("\\textbf{%s}", party_name_short)
      , party_in_cabinet ~ sprintf("\\underline{%s}", party_name_short)
      , TRUE ~ party_name_short
    )
  ) %>% 
  arrange(country_iso3c, election_date, parl_config_sdate, parl_config_edate, cabinet_formation, desc(seats)) %>% 
  group_by(country_iso3c, election_id, election_date, parl_config_sdate, parl_config_edate, cabinet_formation) %>% 
  summarise(
    seats_total = unique(seats_total)
    , parties = paste(unique(sprintf("%s~(%d)", party_str, replace_na(seats, 0L))), collapse = ", ")
  ) %>% 
  ungroup()

parties_by_parl_configs <- parl_party_tweets %>%
  group_by(
    country_iso3c
    , party_name_english, party_name_short, party_id
    , screen_name
    , elec_period_sdate, elec_period_edate, pre_elec_period
    , parl_config_sdate, parl_config_edate
  ) %>%
  summarise(
    seats = unique(seats)
    , seats_total = unique(seats_total)
    , seat_share = unique(seats/seats_total)
    , seat_share_str = unique(sprintf("%s/%s", replace_na(as.character(unique(seats)), "-"), replace_na(as.character(unique(seats_total)), "-")))
    , cab_party = unique(party_in_cabinet)
    , pm_party = unique(party_pm)
    , n_tweets = n_distinct(status_id)
    , first_tweet = min(lubridate::as_date(created_at))
    , last_tweet = max(lubridate::as_date(created_at))
  ) %>%
  group_by(
    country_iso3c
    , party_name_english, party_name_short, party_id
    , elec_period_sdate, elec_period_edate, pre_elec_period
    , parl_config_sdate, parl_config_edate
    , .add = FALSE
  ) %>%
  summarise(
    screen_names = paste(sort(unique(screen_name)), collapse = ", ")
    , seats = sum(seats)
    , seats_total = unique(seats_total)
    , seat_share = sum(seat_share)
    , seat_share_str = NA_character_
    , cab_party = any(cab_party)
    , pm_party = any(pm_party)
    , n_tweets = sum(n_tweets)
    , first_tweet = min(first_tweet)
    , last_tweet = max(last_tweet)
  ) %>%
  mutate(
    seat_share_str = sprintf(
      "%s/%s",
      replace_na(as.character(seats), "-"),
      replace_na(as.character(seats_total), "-")
    )
  ) %>%
  ungroup()

parl_configs <- parties_by_parl_configs %>%
  mutate(
    party_str = case_when(
      pm_party & cab_party ~ sprintf("\\underline{\\textbf{%s}}", party_name_short)
      , pm_party ~ sprintf("\\textbf{%s}", party_name_short)
      , cab_party ~ sprintf("\\underline{%s}", party_name_short)
      , TRUE ~ party_name_short
    )
  ) %>% 
  group_by(
    country_iso3c
    , elec_period_sdate, elec_period_edate, pre_elec_period
    , parl_config_sdate, parl_config_edate
  ) %>% 
  arrange(country_iso3c, parl_config_sdate, desc(seat_share), party_name_short) %>% 
  summarise(
    n_tweets = sum(n_tweets)
    , seats_total = unique(seats_total)
    , n_parties = n_distinct(party_id, party_name_short)
    , parties = paste(unique(sprintf("%s~(%d)", party_str, replace_na(seats, 0))), collapse = ", ")
    , first_tweet = min(first_tweet)
    , last_tweet = max(last_tweet)
  ) %>% 
  ungroup()
  
# basis for data appendix table S2
paper_objects$tables$pre_elec_period_configs <- parl_configs %>% 
  filter(pre_elec_period) %>% 
  transmute(
    country_iso3c
    , election_date = elec_period_edate + days(1)
    , n_parties, parties = gsub("~(0)", "", parties, fixed = TRUE)
    , elec_period_sdate, elec_period_edate
    , n_tweets
    , first_tweet, last_tweet
  )

# basis for data appendix table S1
paper_objects$tables$parl_configs <- parl_configs %>% 
  filter(!pre_elec_period) %>% 
  transmute(
    country_iso3c
    , election_date = elec_period_sdate
    , seats_total, n_parties, parties
    , parl_config_sdate, parl_config_edate
    , n_tweets
    , first_tweet, last_tweet
  )

## c) Parties ----

paper_objects$tables$parties <- parl_party_tweets %>% 
  group_by(
    country_iso3c
    , party_name_short, party_name_english, party_id
    , screen_name, user_id
    , pre_elec_period, elec_period_sdate, elec_period_edate
  ) %>% 
  summarise(
    n_tweets = n_distinct(status_id)
    , tweets_added_in_2023 = year(median(data_collected_at)) == 2023
  ) %>% 
  arrange(country_iso3c, party_name_short, elec_period_sdate) %>% 
  group_by(
    country_iso3c
    , party_name_short, party_name_english, party_id
    , screen_name, user_id
  ) %>% 
  summarise(
    periods = paste(
      sprintf(
        ifelse(
          pre_elec_period
          , "\\textit{%s~--~%s}~(%d)"
          , "%s~--~%s~(%d)"
        )
        , elec_period_sdate, elec_period_edate, n_tweets
      )
      , collapse = ", "
    ) %>% gsub("~NA(\\}?)~", "\\1~", x = ., perl = TRUE)
    , n_tweets = sum(n_tweets)
    , tweets_added_in_2023 = any(tweets_added_in_2023)
  ) 

# basis for supporting materials table S3
paper_objects$tables$new_party_accounts <- read_tsv(file.path(exdata_path, "twitter_accounts_added_in_2023.tab"))

# 2. SAMPLING -----

paper_objects$desc$sampling <- list()

# note: we sampled tweets for annotation before adding/updating/merging some accounts (in May/June 2023)
# hence we go with this old file
fp <- file.path(input_path, "all_tweets_classified_political.rds")
classified_tweets <- read_rds(fp) 
classified_tweets <- classified_tweets %>% 
  filter(collected_posthoc == "no") %>% 
  mutate(is_en = ifelse(is.na(is_en), FALSE, is_en))

## a) eligibility ----

paper_objects$tables$sampling_eligibility_sample1 <- classified_tweets %>% 
  count(political, is_en) %>% 
  mutate_if(is.factor, as.character) 

## b) clusters ----

k_means500 <- read_rds(file.path(fits_path, "political_en_tweets_clusters.rds"))

paper_objects$desc$sampling$cluster_sizes <- tibble(cluster_id = 1:500, size = as.integer(k_means500$size))

# t-SNE 
paper_objects$desc$sampling$tsne_centroids <- read_rds(file.path(fits_path, "political_en_tweets_clusters_tsnes.rds"))

# clusters
clusters <- stats:::fitted.kmeans(k_means500, "class")

clustered_tweets <- tibble::enframe(clusters) %>%
  separate(name, c("user_id", "status_id"), sep = "_") %>%
  rename(cluster_id = value) %>%
  left_join(
    classified_tweets
    , by = c("user_id", "status_id")
    , relationship = "many-to-many"
  )

paper_objects$desc$sampling$cluster_unit_sizes <- clustered_tweets %>%
  group_by(cluster_id) %>%
  summarise(
    n_countries = n_distinct(country_iso3c)
    , n_parties = n_distinct(party_id)
    , cluster_size = n()
  ) %>%
  ungroup()


# 3. CROWD-CODING -----

paper_objects$desc$coding <- list()

label_map <-  c(
  "yes-general" = "Yes, general"
  , "yes-specific" = "Yes, specific"
  , "yes-unsure" = "Yes, but unsure"
  , "no" = "No"
  , "cannot-answer" = "Cannot answer"
)

## a) codings data ----

codings <- bind_rows(
  read_rds(file.path(codings_path, "codings_sample_1.rds"))
  , read_rds(file.path(codings_path, "codings_sample_2.rds"))
  , .id = "sample"
)

# number of tweets in/across samples 
n_tweets <- codings %>% 
  group_by(sample) %>% 
  summarise(n = n_distinct(status_id)) %>% 
  {setNames(.$n, .$sample)}
n_tweets["total"] <- sum(n_tweets)
(paper_objects$desc$coding$n_tweets <- n_tweets)

# number of labels in/across samples 
n_labels <- codings %>% 
  count(sample, coding) %>%
  split(.$sample) %>% 
  map(~setNames(.$n, .$coding))
(paper_objects$desc$coding$n_labels <- n_labels)

median_of_median_coding_duration <- codings %>%
  group_by(sample, worker_id) %>%
  summarise(median_duration = median(duration)) %>%
  group_by(sample) %>%
  summarise(
    median_median_duration = median(median_duration)
    , mean_median_duration = mean(median_duration)
    , sd_median_duration = sd(median_duration)
    , skew_median_duration = e1071::skewness(median_duration)
  ) %>%
  ungroup()

paper_objects$tables$sample_descriptives <- codings %>% 
  group_by(sample, status_id) %>% 
  summarise(n_judgments = n_distinct(worker_id)) %>% 
  group_by(sample) %>% 
  summarise(
    n_tweets = n_distinct(status_id)
    , judgments_per_tweet = mean(n_judgments)
    , n_judgments = sum(n_judgments)
  ) %>% 
  ungroup() %>% 
  mutate(per_unit_pay = c("$0.024", "$0.048")) %>% 
  left_join(
    summarize(group_by(codings, sample), n_coders = n_distinct(worker_id))
  ) %>% 
  left_join(median_of_median_coding_duration) %>% 
  select(
    sample
    , n_tweets
    , judgments_per_tweet, n_judgments, n_coders
    , per_unit_pay
    , ends_with("median_duration")
  )
  

## b) coder stats -----

paper_objects$data$coding$judgments_per_coder <- count(codings, sample, worker_id)

paper_objects$desc$coding$coder_durations <- codings %>% 
  group_by(sample, worker_id) %>% 
  summarise(start = min(start_time), end = max(end_time))

paper_objects$tables$coder_stats <- coder_stats <- codings %>% 
  group_by(sample, worker_id) %>% 
  summarize(
    n_judgments = n_distinct(item_id)
    , judgment_entropy = entropy(coding)
    , judgment_set = list(rev(sort(table(coding))))
    , mean_duration = mean(duration)
    , median_duration = median(duration)
    , q10_duration = quantile(duration, .10)
    , q25_duration = quantile(duration, .25)
    , q75_duration = quantile(duration, .75)
    , q90_duration = quantile(duration, .90)
    , badbadnotgood = case_when(
      n_judgments <= 5 ~ unique(worker_id)
      , n_judgments > 5 & judgment_entropy < .25  ~ unique(worker_id)
      , n_judgments > 50 & judgment_entropy < .5  ~ unique(worker_id)
      , n_judgments > 100 & judgment_entropy < 1  ~ unique(worker_id)
      , TRUE ~ NA_character_
    ) 
  ) %>% 
  ungroup()


## c) tweet stats ----

paper_objects$tables$tweet_stats <- codings %>% 
  group_by(sample, item_id, status_id, source) %>% 
  summarize(
    median_duration = median(duration)
    , mean_duration = mean(duration)
    , n_judgments = n_distinct(worker_id)
    , judgment_entropy = entropy(coding)
    , judgments = list(rev(sort(table(coding))))
    , mode_label = map_chr(map(judgments, names), 1)
    , mode_label_n = map_int(judgments, 1)
    , mode_is_tie = map(judgments, 2)
    , mode_is_tie = ifelse(map_lgl(mode_is_tie, is.null), 0L, map_int(mode_is_tie, 1))
    , mode_is_tie = mode_label_n == mode_is_tie
  ) %>% 
  ungroup() 


## d) codings data cleaning ----

# get IDs of "bad" coders
bad_coders <- paper_objects$tables$coder_stats %>%
  filter(!is.na(badbadnotgood)) %>%
  {split(.$worker_id, .$sample)}

# determine valid codings
codings <- mutate(codings, coding = ifelse(coding %in% names(label_map), coding, NA_character_))

paper_objects$tables$tweet_stats_no_bad_coders <- codings %>% 
  filter(
    !is.na(coding)
    , !(sample == "1" & worker_id %in% bad_coders[[1]])
    , !(sample == "2" & worker_id %in% bad_coders[[2]])
    , duration >= 4
  ) %>% 
  group_by(sample, item_id, status_id, source) %>% 
  summarize(
    median_duration = median(duration)
    , mean_duration = mean(duration)
    , n_judgments = n_distinct(worker_id)
    , judgment_entropy = entropy(coding)
    , judgments = list(rev(sort(table(coding))))
    , mode_label = map_chr(map(judgments, names), 1)
    , mode_label_n = map_int(judgments, 1)
    , mode_is_tie = map(judgments, 2)
    , mode_is_tie = ifelse(map_lgl(mode_is_tie, is.null), 0L, map_int(mode_is_tie, 1))
    , mode_is_tie = mode_label_n == mode_is_tie
  ) %>% 
  ungroup() 

# tabulate cleaning breakdown

data_cleaning_summary_helper <- function(x, step) {
  count(x, sample, coding) %>% 
    mutate_at(2, factor, names(label_map), label_map) %>% 
    mutate(step = step)
}

# baseline
dcs1 <- data_cleaning_summary_helper(codings, "")

# step 1
dcs2 <- codings %>% 
  filter(
    !is.na(coding)
    , !(sample == "1" & worker_id %in% bad_coders[[1]])
    , !(sample == "2" & worker_id %in% bad_coders[[2]])
  ) %>% 
  data_cleaning_summary_helper("codings of ``low-quality'' coders")

# step 2
tmp <- codings %>% 
  filter(
    !is.na(coding)
    , !(sample == "1" & worker_id %in% bad_coders[[1]])
    , !(sample == "2" & worker_id %in% bad_coders[[2]])
    , duration >= 4
  ) 

dcs3 <- data_cleaning_summary_helper(tmp, "judgments made in < 4 seconds")

# majority cannot-answer tweets
majority_ca_tweets <- tmp %>% 
  group_by(sample, status_id) %>% 
  summarise(
    judgments = list(rev(sort(table(coding))))
    , mode_label = map_chr(map(judgments, names), 1)
  ) %>% 
  ungroup() %>% 
  filter(mode_label == "cannot-answer")

# step 3
dcs4 <- tmp %>% 
  anti_join(majority_ca_tweets) %>% 
  data_cleaning_summary_helper("majority-``Cannot answer'' tweets")

# cleaned codings
codings_cleaned <- tmp %>% 
  anti_join(majority_ca_tweets) %>% 
  filter(coding != "cannot-answer")

# step 4
dcs5 <- codings_cleaned %>% 
  data_cleaning_summary_helper("other ``Cannot answer'' codings")


paper_objects$tables$judgment_data_cleaning <- bind_rows(dcs1, dcs2, dcs3, dcs4, dcs5) %>% 
  pivot_wider(names_from = c("coding"), values_from = "n") %>% 
  mutate(total = rowSums(.[,3:8], na.rm = T))


## e) inter-coder reliability ----

alpha_baseline <- list()
alpha_baseline$pooled <- codings %>% 
  select(item_id, worker_id, coding) %>% 
  pivot_wider(names_from = "item_id", values_from = "coding") %>% 
  krippendorffs_alpha()

alpha_baseline$sample1 <- codings %>% 
  filter(sample == "1") %>% 
  select(item_id, worker_id, coding) %>% 
  pivot_wider(names_from = "item_id", values_from = "coding") %>% 
  krippendorffs_alpha()

alpha_baseline$sample2 <- codings %>% 
  filter(sample == "2") %>% 
  select(item_id, worker_id, coding) %>% 
  pivot_wider(names_from = "item_id", values_from = "coding") %>% 
  krippendorffs_alpha()

alpha_cleaned <- list()
alpha_cleaned$pooled <- codings_cleaned %>% 
  select(item_id, worker_id, coding) %>% 
  pivot_wider(names_from = "item_id", values_from = "coding") %>% 
  krippendorffs_alpha()

alpha_cleaned$sample1 <- codings_cleaned %>% 
  filter(sample == "1") %>% 
  select(item_id, worker_id, coding) %>% 
  pivot_wider(names_from = "item_id", values_from = "coding") %>% 
  krippendorffs_alpha()

alpha_cleaned$sample2 <- codings_cleaned %>% 
  filter(sample == "2") %>% 
  select(item_id, worker_id, coding) %>% 
  pivot_wider(names_from = "item_id", values_from = "coding") %>% 
  krippendorffs_alpha()

paper_objects$desc$coding$alpha_baseline <- alpha_baseline
paper_objects$desc$coding$alpha_cleaned <- alpha_cleaned


# 4. LABELING ----

# load fitted Dawid-Skene model fits
paper_objects$misc$em_fit_s1 <- read_rds(file.path(fits_path, "dawidskene_model_sample_1.rds"))
# !!!
paper_objects$misc$em_fit_pooled <- read_rds(file.path(fits_path, "dawidskene_model_pooled_samples.rds"))

# a) labelings and training (sample 1) ----

# labelings
fp <- file.path(labelings_path, "dawidskene_labelings_sample_1.csv")
paper_objects$data$labelings$labeled_tweets_s1 <- read_csv(fp, col_types = "cciccccdidddddddcc")

# training data
fp <- file.path(data_path, "intermediate", "training", "training_data_sample_1.rds")
paper_objects$data$labelings$training_data_s1 <- read_rds(fp)

# classifier
glmnet_fit_path <- file.path(fits_path, "glmnet_classifier_sample_1_tweets.rds")
glmnet_grid_search <- read_rds(glmnet_fit_path)

paper_objects$misc$glmnet_s1_grid_search_results <- glmnet_grid_search$results


# 5. CLASSIFIER EVALUATION ----

# created in script 'code/03-classify/04-summarize_classifier_evaluations.R'
# note: the results in this TSV file are generated in python with 
#       scikit learn's classification_report() function
#       see https://muthu.co/understanding-the-classification-report-in-sklearn/
evaluated <- read_tsv(file.path(output_path, "classifier_results", "all_classifier_results.tab"))

paper_objects$data$classifiers_performances_binary_wide <- evaluated

table(evaluated$approach)
paper_objects$desc$classifiers$approach_map <- approach_map <- c(
  "LLM fine-tuning" = "transformer",
  "MSE" = "mse",
  "MT+BoW" = "bow"
)

table(evaluated$learner)
paper_objects$desc$classifiers$models_map <- models_map <- c(
  "xlmt" = "XLM-T",
  "nb" = "Naive Bayes",
  "svm" = "SVM",
  "per" = "Ridge regression",
  "mlp" = "MLP"
)

# get best results by approach
best_classifiers <- evaluated %>% 
  filter(what == "pos") %>% 
  group_by(approach) %>% 
  filter(`f1-score` == max(`f1-score`)) %>% 
  ungroup() %>% 
  distinct(approach, learner, features) %>% 
  left_join(evaluated) %>% 
  mutate(learner = ifelse(is.na(learner), "xlmt", learner))
  

paper_objects$tables$best_classifiers_performances <- best_classifiers %>%
  filter(grepl("avg", what)) %>%
  transmute(features, learner, f1 = `f1-score`, what = sub(" avg", "", what)) %>%
  pivot_wider(names_from = "what", values_from = "f1") %>%
  arrange(desc(macro)) %>%
  left_join(
    best_classifiers %>%
      filter(what == "pos") %>%
      select(features, precision, recall)
  ) %>%
  left_join(
    best_classifiers %>%
      filter(what == "neg") %>%
      select(features, specificity = recall)
  )

# 6. MEASUREMENTS ----

# load parties codes mapping (incl. 'to_keep' and 'is_populist' indicators)
party_codes_mapping <- read_csv(file.path(exdata_path, "party_codes_mapping.csv"))

## a) Tweet-level summary ----

parl_party_tweets_labeled <- read_rds(file.path(output_path, "parl_party_tweets_labeled.rds"))

paper_objects$tables$elite_criticism_X_political <- parl_party_tweets_labeled %>%
  count(political, elitecriticism) %>%
  group_by(political) %>%
  mutate(pct = round(n/sum(n)*100, 1)) %>%
  ungroup() %>%
  pivot_wider(names_from = elitecriticism, values_from = n:pct) %>%
  select(political, ends_with("yes"), ends_with("no"))

paper_objects$tables$elite_criticism_X_political_by_country <- parl_party_tweets_labeled %>% 
  count(country_iso3c, political, elitecriticism) %>% 
  group_by(country_iso3c, political) %>% 
  mutate(pct = round(n/sum(n)*100, 1)) %>% 
  ungroup() %>% 
  pivot_wider(names_from = elitecriticism, values_from = n:pct) %>% 
  select(country_iso3c, political, ends_with("yes"), ends_with("no"))

## b) Party-level summary ----

### party means plot ----

pdat <- parl_party_tweets_labeled %>% 
  filter(political == "yes") %>% 
  left_join(select(party_codes_mapping, party_id, party_name_short, to_keep, is_populist)) %>% 
  group_by(country_iso3c, party_id, party_name_short, to_keep, is_populist) %>% 
  summarise(mean_prob_elitecriticism = mean(prob_elitecriticism)) %>% 
  group_by(country_iso3c) %>%
  mutate(
    rank_ = dense_rank(mean_prob_elitecriticism)
    , is_populist = factor(is_populist, c("0", "1", "?"), c("no", "yes", "unknown"))
  ) %>% 
  ungroup()

tmp <- pdat %>% 
  mutate(
    keep_ = case_when(
      # only show European countries w/o GBR and IRL
      country_iso3c %in% c("AUS", "CAN", "NZL", "GBR", "IRL") ~ FALSE
      # for Spain: only keep PSOE, PP, Vox, Ciudadanos, Podemos 
      , country_iso3c == "ESP" & !(party_name_short %in% c("PSOE", "AP-P", "Vox", "C-PC", "P")) ~ FALSE
      # remove “unknown” category and parties
      , is_populist == "unknown" ~ FALSE
      , TRUE ~ TRUE
    )
  ) %>% 
  filter(keep_) 

party_averages_plots <- tmp %>% 
  split(.$country_iso3c) %>% 
  map(function(dat) {
    p <- dat %>% 
      filter(to_keep == "1") %>% 
      ggplot(
        aes(
          x = mean_prob_elitecriticism
          , y = reorder(party_name_short, mean_prob_elitecriticism)
          , color = is_populist
        )
      ) + 
      geom_point() +
      scale_x_continuous(breaks = seq(0, .5, .1)) +
      scale_color_brewer(
        breaks = c("no", "yes")
        , type = "qual", palette = "Dark2"
        , drop = FALSE
      ) +
      guides(color = "none") +
      expand_limits(x = c(0.00, .5)) +
      facet_wrap(~country_iso3c, strip.position = "top") +
      labs(x = NULL, y = NULL, color =  "Populist:") +
      theme(legend.position = "bottom", legend.direction = "horizontal")
    
    list(plot = p, n = length(unique(dat$party_id)))
  })

n_ <- map_int(party_averages_plots, "n")
(grps <- cumsum(n_) %/% ceiling(sum(n_)/4))
grps[length(grps)] <- 3L

party_averages_plots <- map(unique(grps), function(grp) {
  cntrs <- names(grps[grps == grp])
  
  p <- map(party_averages_plots[cntrs], "plot")
  p[[length(p)]]$labels$x <- "Anti-elite strategy"
  p[[length(p)]]$guides <- NULL
  p <- reduce(p, `+`)
  p + plot_layout(ncol = 1, heights = n_[cntrs])
})

paper_objects$figures$results$party_plots <- (
  (party_averages_plots[[1]] + guides(color = "none")) |
    (party_averages_plots[[2]] + guides(color = "none")) |
    (party_averages_plots[[3]] + guides(color = "none")) |
    party_averages_plots[[4]]
) +
  plot_layout(ncol = 4, guides = "collect") & 
  theme(legend.position = "bottom")

# difference in means (reported in text in main paper, first paragraph of subsection "Validating measurements")
paper_objects$misc$regressions$dim_populist_party_means <- lm(mean_prob_elitecriticism ~ is_populist, tmp)
summary(paper_objects$misc$regressions$dim_populist_party_means)
# note: see row 'is_populistyes'

## c) Time series ----

ts <- read_rds(file.path(output_path, "parl_party_tweets_elitecriticism_fixed_window_timeseries.rds"))

### tweets per time units distributions by country ----

tmp <- map_dfr(ts, select, !!c("country_iso3c", "n_tweets"), .id = "window") %>%
    mutate_at(1, str_to_title) %>% 
    mutate_at(1, factor, c("Halfyear", "Quarter", "Month", "Week")) 

p1 <- tmp %>% 
  ggplot(
    aes(
      x = reorder(country_iso3c, desc(country_iso3c))
      , fill = factor(n_tweets > 0, c(T, F), c("yes", "no"))
    )
  ) +
    geom_bar(position = "fill", width = .3) + 
    scale_fill_grey() +
    guides(fill = "none") + 
    coord_flip() +
    scale_y_continuous(
      breaks = c(.25, .5, .75)
    ) +
    facet_grid(cols = vars(window)) + 
    labs(
      subtitle = "(a) Tweets availability"
      , x = NULL, y = NULL, fill = "Tweets available"
    ) + 
    theme(
      plot.subtitle = element_text(hjust = .5)
      , axis.text.x = element_text(angle = 45, hjust = .7, vjust = .7)
    )

p2 <- tmp %>% 
  filter(n_tweets > 0) %>% 
  ggplot(aes(y = reorder(country_iso3c, desc(country_iso3c)), x = n_tweets, height = after_stat(density))) + 
    geom_density_ridges(fill = "black",  alpha = .8, stat = "binline", binwidth = .1, scale = 0.5, draw_baseline = FALSE, color = NA) + 
    scale_x_continuous(trans = "log10") +
    facet_grid(cols = vars(window)) + 
    labs(y = NULL, x = NULL, subtitle = "(b) Numbers of tweets in non-missing units") + 
    theme(
      plot.subtitle = element_text(hjust = .5)
      , axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
    )

paper_objects$figures$results$n_tweets_distribution_by_country <- p1 + p2

### party-quarter missingness ----

paper_objects$figures$results$party_quarter_missingness <- ts$quarter %>% 
    filter(substr(quarter, 1, 4) >= 2008) %>% 
    ggplot(aes(y = party_name_short, x = quarter, fill = n_tweets > 0)) + 
    geom_tile(alpha = .75) + 
    scale_x_discrete(breaks = paste(2007:2020, "01", sep = "-"), labels = 2007:2020) +
    scale_y_discrete(breaks = NULL) +
    scale_fill_grey() +
    guides(fill = "none") +
    facet_grid(
      rows = vars(country_iso3c)
      , scales = "free_y"
      , space = "free_y"
    ) + 
    labs(
      x = NULL
      , y = NULL
    ) + 
    theme(strip.text.y = element_text(angle = 0))

### party-quarter plots for DEU, GBR, and SWE ----

selected_ts <- ts$quarter %>% 
  filter(country_iso3c %in% c("DEU", "SWE", "GBR")) %>% 
  filter(!party_name_short %in% c("UUP", "SDLP")) %>% 
  filter(substr(quarter, 1, 4) >= 2008) %>% 
  left_join(select(party_codes_mapping, party_id, to_keep, is_populist)) %>% 
  filter(to_keep == "1")

# hard-code ordering of parties within country
gbr_parties <- c("UKIP", "DUP", "Con", "Lib", "SNP", "Plaid", "Lab", "SF", "GP")
deu_parties <- c("AfD", "FDP", "CDU+CSU", "SPD", "B90/Gru", "Li/PDS")
swe_parties <- c("M", "KD", "FP", "C", "SD", "SAP", "MP", "JL", "V", "FI")

selected_ts$party_name_short <- factor(
  selected_ts$party_name_short,
  c(gbr_parties, deu_parties, swe_parties)
)

paper_objects$figures$results$paryt_quarter_selected <- selected_ts %>% 
  ggplot(
    aes(
      x = quarter
      , y = reorder(party_name_short, party_name_short)
      , fill = mean_prob_elitecriticism
    )
  ) +
  geom_raster(hjust = 1) +
  scale_x_discrete(name = NULL, breaks = paste0(2008:2020, "-01"), labels = 2008:2020) +
  scale_y_discrete(name = NULL) +
  viridis::scale_fill_viridis(
    name = "Anti-elite strategy"
    , begin = .1, end = .9, limits = c(0, .5)
    , option = "B", na.value = "grey80"
  ) + 
  facet_grid(rows = vars(country_iso3c), scales = "free_y", space = "free_y") + 
  theme(
    strip.text.y = element_text(angle = 0)
    , legend.position =  "bottom", legend.direction = "horizontal"
    , legend.margin = margin(), legend.box.margin = margin()
    , legend.key.height = unit(.1, "in")
    , legend.title = element_text(hjust = 0, vjust = .9)
  )

### all party-quarter plots ----

quarter_plots_data <- ts$quarter %>% 
  left_join(
    select(party_codes_mapping, country_iso3c, party_id, party_name_short, left_right_rank)
    , by = c("country_iso3c", "party_name_short", "party_id")
    , relationship = "many-to-many"
  ) %>% 
  filter(!is.na(left_right_rank)) %>% 
  filter(substr(quarter, 1, 4) >= 2008) %>% 
  split(.$country_iso3c)

paper_objects$figures$results$party_quarter_plots <- imap(
  quarter_plots_data
  , function(.data, .nm, .countries = with(paper_objects$data$countries, setNames(country_iso3c, country_name))) {
    nr_ <- which(.countries == .nm)
    
    p <- ggplot(.data, aes(y = reorder(party_name_short, desc(left_right_rank)), x = quarter, fill = mean_prob_elitecriticism)) +
      geom_tile() + 
      expand_limits(x = c("2008-01", "2020-01")) +
      scale_x_discrete(name = NULL, breaks = paste0(2008:2020, "-01"), labels = 2008:2020) +
      scale_y_discrete(name = NULL) +
      viridis::scale_fill_viridis(
        name = "Anti-elite strategy"
        , begin = .1, end = .9, limits = c(0, .5)
        , option = "B", na.value = "grey80"
      ) + 
      theme(
        plot.subtitle = element_text(hjust = .5)
        , legend.position = "none", legend.direction = "horizontal"
        , legend.margin = margin(t = .1, unit = "in"), legend.box.margin = margin()
        , legend.key.height = unit(.1, "in")
        , legend.title = element_text(hjust = 0, vjust = .9)
      ) + 
      labs(
        subtitle = sprintf(
          "(%s) %s"
          , letters[nr_], names(.countries)[nr_]
        )
      )
    list(plot = p, n_parties = length(unique(.data$party_name_short)))
  }
)

# 7. VALIDATION ----

paper_objects$desc$validation <- list()

## a) content validation (fighting words) ----

fp <- file.path(output_path, "validation", "fighting_words_xmlt_classifier.rds")
fighting_words <- read_rds(fp)

n_words <- 20L

pdat <- fighting_words %>% 
  map("summarized") %>% 
  map("topk") %>% 
  bind_rows(.id = "key") %>% 
  separate(key, c("country_iso3c", "lang"), sep = "-", remove = FALSE) %>% 
  arrange(country_iso3c, lang, desc(elitecriticism)) %>% 
  filter(elitecriticism == "yes") %>% 
  group_by(country_iso3c, lang) %>% 
  mutate(rank_ = row_number()) %>%
  filter(rank_ <= n_words) 

# English-speaking countries
paper_objects$figures$validation$fighting_words_en <- p_en <- pdat %>% 
  filter(key %in% paste(c("AUS", "CAN", "GBR", "IRL", "NZL"), "en", sep = "-")) %>%
  ggplot(aes(y = rank_, x = 1, fill = abs(z_score), label = feature)) + 
    geom_tile() + 
    geom_text(size = 2.5, color = "white") + 
    scale_x_continuous(breaks = NULL) + 
    scale_y_reverse(breaks = NULL) + 
    facet_wrap(~country_iso3c, nrow = 1) + 
    labs(x = NULL, y = NULL, fill = expression(italic(z)~score)) + 
    theme(legend.position = "bottom")

# German-speaking countries
p_de <- p_en
# p_de$data <- filter(pdat, key %in% paste(c("AUT", "CHE", "DEU", "LUX"), "de", sep = "-"))
p_de$data <- filter(pdat, key %in% paste(c("AUT", "CHE", "DEU"), "de", sep = "-"))
paper_objects$figures$validation$fighting_words_de <- p_de

# tweets from Spain
p_esp <- p_en
p_esp$data <- filter(pdat, country_iso3c == "ESP", lang != "pt") 
paper_objects$figures$validation$fighting_words_esp <- p_esp + facet_wrap(~lang, nrow = 1) 

## b) convergent validation against CHES ----

# data files for cross-validation against CHES estimates are created in file
#  "code/05-validate/02-validate_elitecriticism_estimates_against_ches.R"
ches_val_data <- read_rds(file.path(output_path, "validation", "party_averages_own_vs_ches_estimates.rds"))

paper_objects$tables$ches_correlations_detailed <- read_csv(file.path(output_path, "validation", "ches_party_correlations_detailed.csv"))

# correlations when dropping these
paper_objects$desc$validation$party_averages_mean_ches_correlations <- ches_val_data %>% 
  filter(n_tweets >= 100) %>% 
  group_by(year) %>% 
  summarise(
    corr = cor(mean_prob_elitecriticism, antielite_salience, use = "pairwise.complete.obs")
    , corr_keep = cor(mean_prob_elitecriticism[to_keep == 1L], antielite_salience[to_keep == 1L], use = "pairwise.complete.obs")
    # required for placement in plot
    , mean_prob_elitecriticism = max(mean_prob_elitecriticism)*.9
  ) %>% 
  mutate(
    lab = sprintf("r=%0.3f", corr) 
    # required for placement in plot
    , antielite_salience = 0.5
  )

paper_objects$figures$validate$ches_correlations_party_averages <- ches_val_data %>% 
  filter(n_tweets >= 100) %>% 
  ggplot(aes(x = mean_prob_elitecriticism, y = antielite_salience)) + 
    geom_point(size = .2, alpha = .5) + 
    geom_smooth(linewidth = .5, method = "lm", formula = y ~ x) + 
    geom_text(
      data = paper_objects$desc$validation$party_averages_mean_ches_correlations
      , mapping = aes(label = lab)
      , family = font_family
      , size = 3
    ) +
    # scale_x_continuous(limits = c(0, .5)) +
    scale_y_continuous(limits = c(0, 10)) +
    facet_grid(cols = vars(year), scales = "free_x") +
    labs(
      x = "Anti-elite strategy (Party-average estimates)"
      , y = "CHES: Anti-elite salience (mean)"
    )

## c) benchmarking: convergence with CHES compared to dictionaries ----

paper_objects$tables$dictionaries_overview <- read_tsv(file.path(exdata_path, "dictionaries_overview.tab"))

# for 12-months window
ches_party_averages <- read_rds(file.path(output_path, "validation", "party_averages_dictionaries_vs_ches_estimates.rds"))

methods_map <- c("XML-T classifier (ours)", "Gründl (2020)", "Rooduijn and Pauwels (2011)")

paper_objects$tables$dictionary_comparison <- ches_party_averages %>% 
  filter(to_keep == 1) %>% 
  group_by(year) %>%
  summarise(
    across(
      c(mean_prob_elitecriticism, prop_anitelite_dict_gruendl, prop_anitelite_dict_rooduijn)
      , ~cor(antielite_salience, ., use = "pairwise.complete.obs")
    )
  ) %>% 
  setNames(c("CHES wave", methods_map))

# for varying windows
correlations_by_n_quarters <- read_csv(file.path(output_path, "validation", "ches_dictionary_correlations_detailed.csv"))

# analyze
paper_objects$figures$validation$dictionary_comparison_by_quarters <- correlations_by_n_quarters %>% 
  pivot_longer(starts_with("corr_")) %>%
  mutate(
    name = factor(
      name
      , c("corr_clf", "corr_dict_gruendl", "corr_dict_rooduijn")
      , methods_map
    )
    , lab_ = ifelse(
      value > 0
      , sprintf("%.2f\n(%s)\n\n\n", value, year)
      , sprintf("\n\n\n%.2f\n(%s)", value, year)
    )
  ) %>% 
  ggplot(
    aes(
      y = value, ymin = 0, ymax = value
      , x = quarters
      , label = lab_
      , group = year
    )
  ) + 
  geom_linerange(position = position_dodge(1)) +
  geom_point(position = position_dodge(1)) + 
  geom_text(size = 2, position = position_dodge(1)) + 
  ylim(c(-.12, .9)) + 
  facet_grid(
    # cols = vars(year)
    cols = vars(name)
  ) + 
  labs(
    y = "Correlation with CHES estimates"
    , x = "Number of quarters"
  )


# 8. DOWNSTREAM ANALYSES ----

## a) mainstream vs challenger parties ----

parl_configs <- read_rds(file.path(input_path, "parl_configs_with_party_system_status.rds"))

# left-join tweets
tmp <- parl_party_tweets_labeled %>%
  left_join(parl_configs) %>%
  mutate(
    party = paste(party_name_short, party_id, sep = "--")
    , quarter = factor(sprintf("%d-%02d", year(created_at), quarter(created_at)))
  ) 

tmp <- tmp %>% 
  group_by(country_iso3c, party, party_name_short, quarter, is_chal) %>% 
  summarise(mean_prob_elitecriticism = mean(prob_elitecriticism, na.rm = TRUE)) %>% 
  ungroup()

paper_objects$figures$results$elitecriticism_party_type_by_country <- tmp %>%
  select(
    country_iso3c
    , party_name_short, quarter
    , is_chal
    , mean_prob_elitecriticism
  ) %>%
  ggplot(aes(x = country_iso3c, y = mean_prob_elitecriticism, fill = is_chal)) +
    geom_boxplot(outlier.color = NA, size = .5) +
    scale_y_continuous(limits = c(0, 0.5)) +
    scale_fill_manual(breaks = c(F, T), values = c("#dddddd", "#a3a3a3"), labels = c("mainstream", "challenger")) + 
    labs(x = NULL, y = NULL, fill = "Party type: ") + 
    theme(legend.position = "bottom")

## b) regressions on polls and coalition inclusion probability indicators ----

dat <- read_rds(file.path(output_path, "party_antielite_strategies_quarterly_data.rds"))

### describe polls data ----

paper_objects$data$polls$n_obs_by_country_and_quarter <- dat %>% 
  filter(!is.na(spolls_mean)) %>% 
  group_by(country_iso3c, quarter) %>% 
  summarize(
    n_obs = n()
    , parties = paste(sort(party_name_short), collapse = ", ")
  ) %>% 
  ungroup()

paper_objects$data$polls$n_obs_by_party <- dat %>% 
  filter(!is.na(spolls_mean)) %>% 
  group_by(country_iso3c, party_name_short) %>% 
  summarize(
    n_obs = n()
    , first_quarter = min(quarter)
    , last_quarter = max(quarter)
  ) %>% 
  ungroup()

paper_objects$figures$analyses$polls_distribution_by_country <- dat %>% 
  filter(!is.na(spolls_mean)) %>% 
  select(country_iso3c, quarter, party_name_short, spolls_mean) %>%
  ggplot(aes(x = spolls_mean, y = reorder(country_iso3c, desc(country_iso3c)))) + 
    geom_density_ridges(scale = .7, color = "lightgrey") + 
    labs(y = NULL, x = "Expected vote share in polls (quarter average)")


### describe CIP data ----

paper_objects$data$cip$n_obs_by_country_and_quarter <- dat %>% 
  filter(!is.na(cip_mean)) %>% 
  group_by(country_iso3c, quarter) %>% 
  summarize(
    n_obs = n()
    , parties = paste(sort(party_name_short), collapse = ", ")
  ) %>% 
  ungroup()

paper_objects$data$cip$n_obs_by_party <- dat %>% 
  filter(!is.na(cip_mean)) %>% 
  group_by(country_iso3c, party_name_short) %>% 
  summarize(
    n_obs = n()
    , first_quarter = min(quarter)
    , last_quarter = max(quarter)
  ) %>% 
  ungroup()
  
paper_objects$figures$analyses$cip_distribution_by_country <- dat %>% 
  filter(!is.na(cip_mean)) %>% 
  select(country_iso3c, quarter, party_name_short, cip_mean) %>%
  ggplot(aes(x = cip_mean, y = reorder(country_iso3c, desc(country_iso3c)))) + 
  geom_density_ridges(scale = .7, color = "lightgrey") + 
  labs(y = NULL, x = "Coalition inclusion probability (quarter average)")

### run regressions ----

# prepare the data
regression_data <- dat %>% 
  select(
    # unit indictaor
    country_iso3c, party_name_short, party_id, 
    # time indicator
    quarter,
    # DV
    mean_prob_elitecriticism, 
    # IDVs
    spolls_mean, cip_mean,
    # moderator
    is_chal
  ) %>% 
  distinct() %>% 
  # create unit ID
  mutate(
    uid = paste(country_iso3c, party_id, party_name_short, sep = "-")
  )

# Models 1a and 1b: Effect of polls on anti-elite strategies for challenger vs. mainstream parties

model1a <- plm(
  mean_prob_elitecriticism ~ lag(mean_prob_elitecriticism) + lag(spolls_mean),
  data = filter(regression_data, !is_chal), 
  index = c("uid", "quarter"),
  model = "within", 
  effect = "individual",
) 

model1b <- plm(
  mean_prob_elitecriticism ~ lag(mean_prob_elitecriticism) + lag(spolls_mean),
  data = filter(regression_data, is_chal), 
  index = c("uid", "quarter"),
  model = "within", 
  effect = "individual",
) 

paper_objects$misc$regressions$elitecriticism_polls <- list(
  "mainstream" = model1a, 
  "challenger" = model1b
)

# Effect of CIP on anti-elite strategies for challenger vs. mainstream parties

model2a <- plm(
  mean_prob_elitecriticism ~ lag(mean_prob_elitecriticism) + lag(cip_mean),
  data = filter(regression_data, !is_chal), 
  index = c("uid", "quarter"),
  model = "within", 
  effect = "individual",
) 

model2b <- plm(
  mean_prob_elitecriticism ~ lag(mean_prob_elitecriticism) + lag(cip_mean),
  data = filter(regression_data, is_chal), 
  index = c("uid", "quarter"),
  model = "within", 
  effect = "individual",
) 

paper_objects$misc$regressions$elitecriticism_cip <- list(
  "mainstream" = model2a, 
  "challenger" = model2b
)


# FINALLY: save to disk ----

write_rds(paper_objects, paper_objects_fp)
